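// NOTE: the next two lines are page-navigation text captured with this listing, not Java source.
// home *** CD-ROM | disk | FTP | other *** search
// Wrap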
import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.util.StringTokenizer; import java.util.Vector; public class Parser { protected Channel channel = new Channel(); protected Vector html = new Vector(500, 100); private int headlinesFound = 0; private String headlines = ""; public Parser() { } public Parser(Channel theChannel) throws Exception { try { this.channel = theChannel; this.html = this.getPage(this.channel.getSource(), false); } catch (Exception var3) { throw new Exception("Could not read the HTML file."); } } public Parser(String theChannel, String htmlFile) throws Exception { try { this.channel = (Channel)IOUtil.openObject(theChannel); } catch (Exception var5) { throw new Exception("Could not open channel " + theChannel); } try { this.html = this.getPage(htmlFile, false); } catch (Exception var4) { throw new Exception("Could not open HTML file " + htmlFile); } } protected void saveToCache(String fileName) throws Exception { String temp = ""; for(int i = 0; i < this.html.size(); ++i) { temp = temp + (String)this.html.elementAt(i); } try { IOUtil.saveTextFile(fileName, temp, false); } catch (Exception var5) { throw new Exception("Parser: Error saving cache file " + fileName + "\n" + var5); } } private String filterTags(String source) { String temp = ""; boolean ignore = false; for(int i = 0; i < source.length(); ++i) { if (!ignore & source.charAt(i) != '<' & source.charAt(i) != '>') { temp = temp + source.charAt(i); } if (source.charAt(i) == '<') { ignore = true; } else if (source.charAt(i) == '>') { ignore = false; } } temp = this.charConvert(temp); return temp; } protected String charConvert(String source) { String[] charList = new String[]{"", "#146;", "", "&", " ", """, "’", "–", "“", "”", "£", "~", "¨", "©", "", "<", ">", "£", "'"}; String[] replaceList = new String[]{"\"", "'", "\"", "&", " ", "\"", "'", "-", "\"", "\"", "¬£", "~", "\"", "¬©", " ", "<", ">", "¬£", "'"}; for(int i = 0; i < source.length(); 
++i) { for(int n = 0; n < charList.length; ++n) { int wordLength = charList[n].length(); if (i + wordLength < source.length() && source.substring(i, i + wordLength).equalsIgnoreCase(charList[n])) { source = source.substring(0, i) + replaceList[n] + source.substring(i + wordLength, source.length()); } } } return source; } protected String[] findHeadlines() throws Exception { int number = this.channel.getHeadlineCount(); String[] returnString = new String[number]; boolean headline = false; String htmlData = ""; String openingMarker = this.channel.getOpeningMarker().toLowerCase(); String closingMarker = this.channel.getClosingMarker().toLowerCase(); String startMarker = this.channel.getStartMarker().toLowerCase(); String endMarker = this.channel.getEndMarker().toLowerCase(); int n = 0; int startLength = startMarker.length(); int endLength = endMarker.length(); int openingMarkerLength = openingMarker.length(); int closingMarkerLength = closingMarker.length(); int startingPosition = 0; boolean headlineSection = false; String carryOver = ""; for(int lineCount = 0; lineCount < this.html.size(); ++lineCount) { String thisLine = ((String)this.html.elementAt(lineCount)).trim(); if (n < number) { for(int i = 0; i < thisLine.length(); ++i) { if (n < number) { try { if (i + startLength <= thisLine.length() & !startMarker.equals("") && thisLine.substring(i, i + startLength).equalsIgnoreCase(startMarker)) { headlineSection = true; i += startLength; } } catch (Exception var25) { throw new Exception("Parser: Failed locating start marker."); } try { if (i + endLength <= thisLine.length() & !endMarker.equals("") && thisLine.substring(i, i + endLength).equalsIgnoreCase(endMarker)) { headlineSection = false; } } catch (Exception var24) { throw new Exception("Parser: Failed locating end marker.\n" + var24); } if (startMarker.equals("")) { headlineSection = true; } try { if (i + openingMarkerLength <= thisLine.length() & headlineSection && thisLine.substring(i, i + 
openingMarkerLength).equalsIgnoreCase(openingMarker)) { startingPosition = i + openingMarkerLength; headline = true; } } catch (Exception var23) { throw new Exception("Parser: Failed locating opening marker."); } try { if (i + closingMarkerLength <= thisLine.length() & headlineSection && thisLine.substring(i, i + closingMarkerLength).equalsIgnoreCase(closingMarker)) { if (!carryOver.equals("")) { try { returnString[n] = this.filterTags(carryOver + thisLine.substring(0, i)).trim(); ++n; } catch (Exception var22) { throw new Exception("Parser: Processing carry over headline."); } } else { try { if (startingPosition < i & i < thisLine.length()) { String theString = thisLine.substring(startingPosition, i); String temp = this.filterTags(theString); if (!this.filterTags(temp).equals("")) { returnString[n] = temp; ++n; } } } catch (Exception var27) { throw new Exception("Parser: Processing normal headline failed."); } } carryOver = ""; startingPosition = 0; headline = false; } } catch (Exception var28) { throw new Exception("Parser: Failed processing closing marker."); } try { if (i == thisLine.length() - 1 & headline && startingPosition < i) { if (carryOver.equals("")) { carryOver = carryOver + thisLine.substring(startingPosition, i + 1) + " "; } else { carryOver = carryOver + thisLine; } } } catch (Exception var26) { throw new Exception("Parsing error: Failed managing an overflow line."); } } } } } return returnString; } protected void getHTML() throws Exception { try { this.html = this.getPage(this.channel.getSource(), false); } catch (Exception var2) { throw new Exception("Error downloading " + this.channel.getName()); } } protected Vector getPage(String theURL, boolean parse) throws Exception { try { URL theSite = new URL(theURL); BufferedReader in = new BufferedReader(new InputStreamReader(theSite.openStream())); Vector tempVector = new Vector(200, 10); int p = 0; if (parse) { throw new Exception("This function is not yet available!"); } else { String inputLine; 
while((inputLine = in.readLine()) != null) { tempVector.add(inputLine); } in.close(); return tempVector; } } catch (Exception var8) { throw new Exception("Could not get channel from " + theURL); } } protected String[] getHeadlines() { StringTokenizer st = new StringTokenizer(this.headlines, "~"); int n = 0; String[] temp; for(temp = new String[this.channel.getHeadlineCount()]; n < this.channel.getHeadlineCount() & st.hasMoreTokens(); ++n) { temp[n] = st.nextToken(); } return temp; } }